{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"../datasets/Google.csv\")\n",
    "df.index = pd.DatetimeIndex(df['Date'].values)\n",
    "close = df[\"Close\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_embargo_times(times, pct_embargo):\n",
    "    step = int(times.shape[0] * pct_embargo)\n",
    "    if step == 0:\n",
    "        embg_times = pd.Series(times, index=times)\n",
    "    else:\n",
    "        embg_times = pd.Series(times[step:], index=times[:-step])\n",
    "        embg_times = embg_times.append(pd.Series(times[-1], index=times[:-step]))\n",
    "    return embg_times"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2004-08-19   2004-10-04\n",
       "2004-08-20   2004-10-05\n",
       "2004-08-23   2004-10-06\n",
       "2004-08-24   2004-10-07\n",
       "2004-08-25   2004-10-08\n",
       "dtype: datetime64[ns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embg_times = get_embargo_times(close.index, pct_embargo=0.01)\n",
    "embg_times.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 7.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tom/anaconda3/lib/python3.6/site-packages/finance_ml-0.1-py3.6.egg/finance_ml/labeling/barriers.py:37: FutureWarning: \n",
      "Passing list-likes to .loc or [] with any missing label will raise\n",
      "KeyError in the future, you can use .reindex() as an alternative.\n",
      "\n",
      "See the documentation here:\n",
      "http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike\n",
      "2018-07-06 19:26:30.557741 100.0% apply_ptslt1 done after 0.03 minutes. Remaining 0.0 minutes.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>t1</th>\n",
       "      <th>trgt</th>\n",
       "      <th>t1_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2004-08-24</th>\n",
       "      <td>2004-08-25</td>\n",
       "      <td>0.036396</td>\n",
       "      <td>t1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2004-08-25</th>\n",
       "      <td>2004-08-26</td>\n",
       "      <td>0.029930</td>\n",
       "      <td>t1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2004-08-31</th>\n",
       "      <td>2004-09-01</td>\n",
       "      <td>0.026605</td>\n",
       "      <td>t1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2004-09-02</th>\n",
       "      <td>2004-09-03</td>\n",
       "      <td>0.024097</td>\n",
       "      <td>t1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2004-09-07</th>\n",
       "      <td>2004-09-08</td>\n",
       "      <td>0.023610</td>\n",
       "      <td>t1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   t1      trgt t1_type\n",
       "2004-08-24 2004-08-25  0.036396      t1\n",
       "2004-08-25 2004-08-26  0.029930      t1\n",
       "2004-08-31 2004-09-01  0.026605      t1\n",
       "2004-09-02 2004-09-03  0.024097      t1\n",
       "2004-09-07 2004-09-08  0.023610      t1"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from finance_ml.stats import get_daily_vol\n",
    "from finance_ml.labeling import get_t1, cusum_filter, get_events\n",
    "\n",
    "vol = get_daily_vol(close)\n",
    "sampled_idx = cusum_filter(close, vol)\n",
    "t1 = get_t1(close, sampled_idx, num_days=1)\n",
    "side =  None\n",
    "events = get_events(close, t_events=sampled_idx, trgt=vol,\n",
    "                       ptsl=[1, 2], t1=t1, side=side)\n",
    "events.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "index = events.index\n",
    "features_df = df.drop(columns=[\"Date\"]).dropna().loc[index]\n",
    "features = features_df\n",
    "label = events['t1_type'].loc[features_df.index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7217741498441944 0.015327005775199071\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import cross_val_score, KFold\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "scores = []\n",
    "for _ in range(10):   \n",
    "    clf = RandomForestClassifier()\n",
    "    kfold = KFold(n_splits=10, shuffle=False)\n",
    "    scores.append(cross_val_score(clf, features, label, cv=kfold))\n",
    "print(np.mean(scores), np.var(scores))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7723824685002032 0.0013833184853011958\n"
     ]
    }
   ],
   "source": [
    "scores = []\n",
    "for _ in range(10):   \n",
    "    clf = RandomForestClassifier()\n",
    "    kfold = KFold(n_splits=10, shuffle=True)\n",
    "    scores.append(cross_val_score(clf, features, label, cv=kfold))\n",
    "print(np.mean(scores), np.var(scores))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Shffuling data introduces data leakage because of simlarity among neighborg, If you shuffle data uniformly, training data has more information that overlaps test data."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 7.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2017-01-10 00:00:00')"
      ]
     },
     "execution_count": 197,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t1.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection._split import _BaseKFold\n",
    "import time\n",
    "\n",
    "\n",
    "def get_train_times(t1, test_times):\n",
    "    trn = t1.copy(deep=True)\n",
    "    for i, j in test_times.iteritems():\n",
    "        df0 = trn[(i <= trn.index) & (trn.index <= j)].index\n",
    "        df1 = trn[(i <= trn) & (trn <= j)].index\n",
    "        df2 = trn[(trn.index <= i) & (j <= trn)].index\n",
    "        trn = trn.drop(df0.union(df1.union(df2)))\n",
    "    return trn\n",
    "\n",
    "\n",
    "class PurgedKFold(_BaseKFold):\n",
    "    def __init__(self, n_splits=3, t1=None, pct_embargo=0., purging=True):\n",
    "        if not isinstance(t1, pd.Series):\n",
    "            raise ValueError('Label through dates must be a pd.Series')\n",
    "        super(PurgedKFold, self).__init__(n_splits=n_splits, shuffle=False,\n",
    "                                          random_state=None)\n",
    "        self.t1 = t1\n",
    "        self.pct_embargo = pct_embargo\n",
    "        self.purging = purging\n",
    "        \n",
    "    def split(self, X, y=None, groups=None):\n",
    "        if (X.index == self.t1.index).sum() != len(self.t1):\n",
    "            raise ValueError('X and t1 must have the same index')\n",
    "        indices = np.arange(X.shape[0])\n",
    "        # Embargo width\n",
    "        embg_size = int(X.shape[0] * self.pct_embargo)\n",
    "        test_ranges = [(i[0], i[-1] + 1) for i in np.array_split(indices, self.n_splits)]\n",
    "        for st, end in test_ranges:\n",
    "            # Test data\n",
    "            test_indices = indices[st:end]\n",
    "            # Training data prior to test data\n",
    "            t0 = self.t1.index[st]\n",
    "            train_indices = self.t1.index.searchsorted(self.t1[self.t1 <= t0].index)\n",
    "            # Add training data after test data\n",
    "            max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())\n",
    "            if max_t1_idx < X.shape[0]:\n",
    "                train_indices = np.concatenate((train_indices, indices[max_t1_idx + embg_size:]))\n",
    "            # Purging\n",
    "            if self.purging:\n",
    "                train_t1 = t1.iloc[train_indices]\n",
    "                test_t1 = t1.iloc[test_indices]\n",
    "                train_t1 = get_train_times(train_t1, test_t1)\n",
    "                train_indices = self.t1.index.searchsorted(train_t1.index)\n",
    "            yield train_indices, test_indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import log_loss, accuracy_score\n",
    "import numpy as np\n",
    "\n",
    "from finance_ml.sampling import get_sample_tw, get_num_co_events\n",
    "\n",
    "def cv_score(clf, X, y, sample_weight=None, scoring='neg_log_loss',\n",
    "             t1=None, n_splits=3, cv_gen=None, pct_embargo=0., purging=False):\n",
    "    if scoring not in ['neg_log_loss', 'accuracy']:\n",
    "        raise Exception('Wrong scoring method')\n",
    "    if cv_gen is None:\n",
    "        cv_gen = PurgedKFold(n_splits=n_splits, t1=t1,\n",
    "                             pct_embargo=pct_embargo,\n",
    "                             purging=purging)\n",
    "    scores = []\n",
    "    for train, test in cv_gen.split(X=X):\n",
    "        train_params = dict()\n",
    "        test_params = dict()\n",
    "        # Sample weight is an optional parameter\n",
    "        if sample_weight is not None:\n",
    "            train_params['sample_weight'] = sample_weight.iloc[train].values\n",
    "            test_params['sample_weight'] = sample_weight.iloc[test].values\n",
    "        clf_ = clf.fit(X=X.iloc[train, :], y=y.iloc[train], **train_params)\n",
    "        # Scoring\n",
    "        if scoring == 'neg_log_loss':\n",
    "            prob = clf_.predict_proba(X.iloc[test, :])\n",
    "            score_ = -log_loss(y.iloc[test], prob, labels=clf.classes_, **test_params)\n",
    "        else:\n",
    "            pred = clf_.predict(X.iloc[test, :])\n",
    "            score_ = accuracy_score(y.iloc[test], pred, **test_params)\n",
    "        scores.append(score_)\n",
    "    return np.array(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-2.8218924493391286 0.3882551420305969\n",
      "CPU times: user 5.83 s, sys: 3.86 ms, total: 5.83 s\n",
      "Wall time: 5.83 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "clf = RandomForestClassifier()\n",
    "t1_ = t1.loc[features.index]\n",
    "\n",
    "scores = []\n",
    "for _ in range(100):\n",
    "    scores_ = cv_score(clf, features, label, pct_embargo=0.01, t1=t1_, purging=False)\n",
    "    scores.append(np.mean(scores_))\n",
    "print(np.mean(scores), np.var(scores))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-2.9363593364167415 0.737761782263656\n",
      "CPU times: user 5.94 s, sys: 7.84 ms, total: 5.95 s\n",
      "Wall time: 5.95 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "scores = []\n",
    "for _ in range(100):\n",
    "    scores_ = cv_score(clf, features, label, pct_embargo=0., t1=t1_, purging=False)\n",
    "    scores.append(np.mean(scores_))\n",
    "print(np.mean(scores), np.var(scores))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### With Sample Weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_co_events = get_num_co_events(close.index, t1, events.index)\n",
    "sample_weight = get_sample_tw(t1, n_co_events, events.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 217,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-2.8055327356602073 0.5953942615123814\n",
      "CPU times: user 6.02 s, sys: 0 ns, total: 6.02 s\n",
      "Wall time: 6.01 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "scores = []\n",
    "for _ in range(100):\n",
    "    scores_ = cv_score(clf, features, label, sample_weight=sample_weight,\n",
    "                       pct_embargo=0.01, t1=t1_, purging=False)\n",
    "    scores.append(np.mean(scores_))\n",
    "print(np.mean(scores), np.var(scores))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 218,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-2.873647648777405 0.9769345931532711\n",
      "CPU times: user 8.24 s, sys: 11.6 ms, total: 8.25 s\n",
      "Wall time: 8.25 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "scores = []\n",
    "for _ in range(100):\n",
    "    scores_ = cv_score(clf, features, label, sample_weight=sample_weight,\n",
    "                       pct_embargo=0., t1=t1_, purging=False)\n",
    "    scores.append(np.mean(scores_))\n",
    "print(np.mean(scores), np.var(scores))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
